## Introduction: Web Scraping

Web scraping is a technique for converting data presented in an unstructured format (HTML tags) on the web into a structured format that can easily be accessed and used. Much of the data available on the web is not readily downloadable: it is embedded in HTML pages, so extracting and using it requires some knowledge and expertise.
We can locate useful data based on its CSS selectors, especially when the webpage uses semantic tag attributes. We can use [SelectorGadget](http://selectorgadget.com/) to find out which CSS selector matches the "review" elements. SelectorGadget can be added as an extension in Google Chrome; it is shown as a magnifying glass.
pacman::p_load(tidyverse,tidytext,viridis,rvest,tm,wordcloud,SnowballC,tidyquant)
We can specify the CSS selector in html_nodes() and extract the text with html_text(). We scrape over 1,800 reviews of Fiat Chrysler Automobiles from Glassdoor. There are about 155 webpages which contain these reviews.
# The reviews span 155 pages on Glassdoor, so we build one URL per page.
n <- 155

# Page 1 has no "_P<k>" suffix; pages 2..n do.
base_url <- "https://www.glassdoor.com/Reviews/FCA-Fiat-Chrysler-Automobiles-Reviews-E149"
FCA_urls <- c(
  paste0(base_url, ".htm"),
  paste0(base_url, "_P", seq(2, n), ".htm")
)

# Fetch each page and extract the text of its first ".hreview" node,
# yielding one character string per page.
FCA_html <- map_chr(FCA_urls, function(url) {
  read_html(url) %>%
    html_node(".hreview") %>%
    html_text()
})
FCA_html[[1]]
[1] "Featured Review Helpful (1)\"love the company has taught me a lot of new information in my contained growth as a mechanic\"StarStarStarStarStarCurrent Employee - Service Technician in Dayton, OHCurrent Employee - Service Technician in Dayton, OHI have been working at FCA Fiat Chrysler Automobiles full-time (More than 3 years)Prospros are that the company is great, pays for your training that will help in advancement. I get to work on the new cars and technology in the automotive fieldConsno cons as of yet, I love the work I do.and the folk I work with. plan on staying with the company through retirementAdvice to Managementkeep up the good workShare on FacebookShare on TwitterShare on WhatsAppShare via EmailCopy LinkLink Copied!Flag as InappropriateFlag as InappropriateHelpful (1) FCA Fiat Chrysler Automobiles Response seconds ago Edit • Delete FCA Fiat Chrysler Automobiles 2017-09-30 21:14 PDT"
We can remove all unwanted characters at this stage
# Data pre-processing: strip scraping artifacts from each page's text.
# Remove newlines (literal match; no regex machinery needed).
FCA_html <- gsub("\n", "", FCA_html, fixed = TRUE)
# Remove all round brackets.
FCA_html <- FCA_html %>% str_replace_all("\\(|\\)", "")
# Remove all backslashes.
FCA_html <- FCA_html %>% str_replace_all("\\\\", "")
# Remove all non words and non numbers
#FCAindeed2<-FCAindeed2%>%str_replace_all("[^A-Za-z0-9]", "")
# Remove bullet separators ("• ").
FCA_html <- FCA_html %>% str_replace_all("• ", "")
# FIX: replace " & " with a single space instead of deleting it — removing
# the surrounding spaces glued adjacent words together ("pros & cons"
# would have become "proscons").
FCA_html <- FCA_html %>% str_replace_all(" & ", " ")
# Remove all non-printable characters.
FCA_html <- FCA_html %>% str_replace_all("[^[:print:]]", "")
# FIX: same word-gluing issue with " /" — replace with a space.
FCA_html <- gsub(" /", " ", FCA_html, fixed = TRUE)
#FCAindeed2<-FCAindeed2%>%stringi::stri_unescape_unicode()
FCA_html[[1]]
[1] "Featured Review Helpful 1\"love the company has taught me a lot of new information in my contained growth as a mechanic\"StarStarStarStarStarCurrent Employee - Service Technician in Dayton, OHCurrent Employee - Service Technician in Dayton, OHI have been working at FCA Fiat Chrysler Automobiles full-time More than 3 yearsProspros are that the company is great, pays for your training that will help in advancement. I get to work on the new cars and technology in the automotive fieldConsno cons as of yet, I love the work I do.and the folk I work with. plan on staying with the company through retirementAdvice to Managementkeep up the good workShare on FacebookShare on TwitterShare on WhatsAppShare via EmailCopy LinkLink Copied!Flag as InappropriateFlag as InappropriateHelpful 1 FCA Fiat Chrysler Automobiles Response seconds ago Edit Delete FCA Fiat Chrysler Automobiles 2017-09-30 21:14 PDT"
# Quick look at the NRC lexicon: number of words per sentiment category.
count(get_sentiments(lexicon = "nrc"), sentiment, sort = TRUE)
Convert the text data to a data frame.
# Collect the scraped pages into a data frame: one row per page.
# tibble() replaces the deprecated data_frame().
GlassdoorPages <- tibble(
  page = seq_len(n),
  text = FCA_html
)
GlassdoorPages %>% head()
Now we have the reviews, and can convert this to a tidy text format.
# Tokenize the reviews into one word per row and record each page's total
# word count (used later as the denominator for relative frequencies).
tidy_FCA <- GlassdoorPages %>%
  unnest_tokens(word, text) %>%
  add_count(page) %>%
  dplyr::rename(page_total = n)

# Remove standard English stop words.
data("stop_words")
tidy_FCA <- tidy_FCA %>%
  anti_join(stop_words, by = "word")

# Domain-specific stop words: company names plus scraping artifacts.
stop_word <- tibble(word = c("chrysler", "fca", "linklink", "fiat",
                             "whatsappshar", "auburn", "twittershar"))
# BUG FIX: the original called anti_join(stop_words) a second time, so the
# custom `stop_word` list above was never actually removed. Join against
# the custom list here.
tidy_FCA <- tidy_FCA %>%
  anti_join(stop_word, by = "word")
tidy_FCA %>% head()
Next, let’s implement the sentiment analysis.
# Attach NRC sentiment categories to each word. The inner join keeps only
# words present in the lexicon; a word can match several sentiments.
FCA_sentiment <- inner_join(tidy_FCA, get_sentiments("nrc"))
head(FCA_sentiment)
Now we have all we need to see the relative changes in these sentiments over the years.
# Use a clean black-and-white ggplot theme for all subsequent plots.
theme_set(theme_bw())
#Alternatively
#FCA_sentiment%>%group_by(page, page_total, sentiment)%>%count()
# Area chart: how six NRC sentiment categories vary across review pages.
FCA_sentiment %>%
count(page, page_total, sentiment) %>%
filter(sentiment %in% c("positive", "negative",
"joy", "trust","fear","sadness"))%>%
mutate(sentiment = as.factor(sentiment)) %>%
#ggplot(aes(page, n / page_total, fill = sentiment)) +
# NOTE(review): n / sum(n) is each row's share of ALL sentiment words in
# the filtered data, not a per-page relative frequency; the commented
# line above (n / page_total) matches the "Relative frequency" axis
# label better — confirm which denominator is intended.
ggplot(aes(page, n / sum(n), fill = sentiment)) +
geom_area(position = "identity", alpha = 0.5) +
labs(y = "Relative frequency", x = "Page",
title = "Sentiment analysis of FCA Glassdoor Reviews",
subtitle = "Using the nrc lexicon")+theme_bw()+
scale_fill_manual(values=viridis_pal(option = "D")(6))+
scale_y_continuous(labels = scales::percent)
# Same per-page view, normalised by each page's word count, with the NRC
# categories lumped into the 6 most common plus "Other" (7 levels total,
# matching the 7 colours below).
FCA_sentiment %>%
  count(page, page_total, sentiment) %>%
  mutate(sentiment = forcats::fct_lump(sentiment, 6)) %>%
  ggplot(aes(x = page, y = n / page_total, fill = sentiment)) +
  geom_area(alpha = 0.5, position = "identity") +
  scale_fill_manual(values = viridis_pal(option = "A")(7)) +
  scale_y_continuous(labels = scales::percent) +
  theme_bw() +
  labs(x = "Page", y = "Relative frequency",
       title = "Sentiment analysis of FCA Glassdoor Reviews",
       subtitle = "Using the nrc lexicon")
# Average AFINN sentiment per page. AFINN scores words from -5 to +5.
# FIX: current tidytext/textdata releases name the score column "value"
# ("score" was the pre-0.1.9 column name); subtitle typo "affin" corrected.
tidy_FCA %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(page) %>%
  summarize(average_sentiment = mean(value), words = n()) %>%
  #filter(words >= 10) %>%
  ggplot(aes(page, average_sentiment)) +
  geom_line() +
  # Dashed red line marks the neutral (zero) sentiment level.
  geom_hline(color = "red", lty = 2, yintercept = 0) +
  labs(y = "Average AFINN sentiment score", x = "Page",
       title = "Sentiment analysis of FCA Glassdoor Reviews",
       subtitle = "Using the AFINN lexicon")
# Top 10 words contributing to each of six NRC sentiment categories.
FCA_sentiment %>%
  count(sentiment, word) %>%
  filter(sentiment %in% c("positive", "negative",
                          "joy", "trust", "fear", "sadness")) %>%
  group_by(sentiment) %>%
  # slice_max() replaces the superseded top_n(); ties are kept, as before.
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  mutate(sentiment = as.factor(sentiment)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col(alpha = 0.8, show.legend = FALSE) +
  coord_flip() +
  scale_y_continuous(expand = c(0, 0)) +
  facet_wrap(~sentiment, scales = "free") +
  labs(y = "Total number of occurrences", x = "",
       title = "Sentiment analysis of FCA Glassdoor Reviews",
       subtitle = "Using the nrc lexicon") +
  theme_bw() +
  scale_fill_manual(values = viridis_pal(option = "D")(6))
# # change text into italics
# theme(strip.text = element_text(face = "italic")) +
# # strip horizontal axis labels
# theme(axis.title.x=element_blank()) +
# theme(axis.ticks.x=element_blank()) +
# theme(axis.text.x=element_blank())
# Per-page relative frequency for six NRC categories, with an explicit
# factor level order so the legend lists negative before positive.
FCA_sentiment %>%
  count(page, page_total, sentiment) %>%
  filter(sentiment %in% c("positive", "negative",
                          "joy", "trust", "fear", "sadness")) %>%
  mutate(sentiment = factor(sentiment,
                            levels = c("negative", "positive", "joy",
                                       "trust", "fear", "sadness"))) %>%
  ggplot(aes(x = page, y = n / page_total, fill = sentiment)) +
  geom_area(alpha = 0.5, position = "identity") +
  theme_bw() +
  labs(x = NULL, y = "Relative frequency",
       title = "Sentiment analysis of FCA Glassdoor Reviews",
       subtitle = "Using the nrc")
# Switch to the bing lexicon (binary positive/negative classification).
FCA_sentiment <- tidy_FCA %>%
  inner_join(get_sentiments("bing"))
FCA_sentiment %>%
  count(page, page_total, sentiment) %>%
  mutate(sentiment = as.factor(sentiment)) %>%
  ggplot(aes(page, n / page_total, fill = sentiment)) +
  geom_area(position = "identity", alpha = 0.5) +
  # FIX: the subtitle previously read "Using the nrc" although this plot
  # is built from the bing lexicon.
  labs(y = "Relative frequency", x = "Page",
       title = "Sentiment analysis of FCA Glassdoor Reviews",
       subtitle = "Using the bing lexicon") +
  theme_bw() +
  # scale_fill_manual(values=viridis_pal(option = "plasma")(2))+
  scale_y_continuous(labels = scales::percent)
The negative and positive sentiment distributions are similar, with the negative sentiments having a higher peak. The negative reviews are distributed about as evenly as the positive reviews; neither is clearly dominant over the other.
# Net sentiment per word (positive count minus negative count), shown as
# a density. pivot_wider() replaces the superseded spread().
GlassdoorPages %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  ggplot(aes(x = sentiment)) +
  geom_density(color = palette_light()[1], fill = palette_light()[1], alpha = 0.8) +
  theme_tq() +
  xlim(c(-5, 5))
# Top 15 positive and negative words (bing lexicon), faceted by sentiment.
FCA_sentiment %>%
  count(sentiment, word) %>%
  group_by(sentiment) %>%
  # slice_max() replaces the superseded top_n(); ties are kept, as before.
  slice_max(order_by = n, n = 15) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  mutate(sentiment = as.factor(sentiment)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  coord_flip() +
  scale_y_continuous(expand = c(0, 0)) +
  facet_wrap(~sentiment, scales = "free") +
  labs(y = "Total number of occurrences", x = "",
       title = "Sentiment analysis of FCA Glassdoor Reviews",
       subtitle = "Using the bing lexicon") +
  #scale_fill_manual(values=viridis_pal(option = "D")(8))+
  scale_fill_viridis(end = 0.75, discrete = TRUE, direction = -1) +
  scale_x_discrete(expand = c(0.02, 0)) +
  # FIX: a complete theme such as theme_minimal() must be applied BEFORE
  # any theme() tweaks — the original added it last, which silently wiped
  # the italic strip text and the blanked x-axis settings. Also note that
  # successive theme(strip.text = ...) calls replace rather than merge,
  # so hjust and face are set in a single element_text().
  theme_minimal(base_size = 13) +
  theme(strip.text = element_text(hjust = 0, face = "italic"),
        axis.title.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank())
# Overall word counts by bing sentiment (word-level, across all pages).
bing_word_counts <-tidy_FCA %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
# NOTE(review): top_n() without an explicit `wt` ranks by the LAST column
# of the spread data ("positive" here, since spread orders columns
# alphabetically) — verify that is the intended ranking column.
bing_word_counts%>%spread(sentiment,n,fill = 0)%>%top_n(10)
bing_word_counts%>%spread(sentiment,n,fill = 0)%>%top_n(-10)%>%head(10)
# Diverging bar chart: each word's signed contribution to sentiment
# (negative-word counts are flipped below zero).
bing_word_counts %>%
  filter(n > 3) %>%
  mutate(n = if_else(sentiment == "negative", -n, n)) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  # geom_col() is the idiomatic replacement for geom_bar(stat = "identity").
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment") +
  #scale_fill_manual(values=viridis_pal(option = "D")(2))
  scale_fill_viridis(end = 0.85, discrete = TRUE, direction = 1)
# n=4
#
# FCA=list()
#
# for (i in 2:n){
#
# FCA[[1]]=read_html("https://www.glassdoor.com/Reviews/FCA-Fiat-Chrysler-Automobiles-Reviews-E149.htm")%>% html_nodes(".hreview")%>%html_text(trim = TRUE)
#
# FCA[[i]]=read_html(paste("https://www.glassdoor.com/Reviews/FCA-Fiat-Chrysler-Automobiles-Reviews-E149_P",i,".htm",sep = ""))%>% html_nodes(".hreview")%>%html_text(trim = TRUE)
#
# }
What words are driving these sentiment scores? To dig deeper into the vocabulary of the reviews themselves, we now switch to the tm package: we build a corpus from the cleaned review text, remove noise words, and examine term frequencies and word associations directly.
#FCA_html2<-FCA_html%>%str_replace_all("[[:xdigit:]]", "")
# Build a tm corpus from the cleaned pages and normalise it.
corpus <- Corpus(VectorSource(FCA_html))
# FIX: plain tolower is not a tm transformation; wrapping it in
# content_transformer() keeps the corpus documents intact (a bare tolower
# corrupts the corpus in modern versions of tm).
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Domain-specific stop words: company names, stemmed share-button text and
# other scraping artifacts.
stop_user <- c("chrysler", "fca", "linklink", "fiat", "whatsappshar",
               "auburn", "twittershar", "automobil", "edit", "delet",
               "via", "edit", "delet", "via", "starstarstarstarstarwork",
               "pdt", "hill", "facebookshar")
corpus <- tm_map(corpus, removeWords, stop_user)
# Term-document matrix with punctuation removal, stemming, etc. applied
# at construction time.
tdm <- TermDocumentMatrix(corpus,
                          control = list(removePunctuation = TRUE,
                                         stopwords = TRUE,
                                         removeNumbers = TRUE, tolower = TRUE,
                                         PlainTextDocument = TRUE,
                                         stripWhitespace = TRUE, stemming = TRUE))
inspect(tdm)
<<TermDocumentMatrix (terms: 2144, documents: 155)>>
Non-/sparse entries: 9513/322807
Sparsity : 97%
Maximal term length: 31
Weighting : term frequency (tf)
Sample :
Docs
Terms 113 115 116 118 27 33 43 68 84 87
ago 2 1 2 1 1 1 1 1 1 2
automobil 2 2 2 2 3 2 2 2 3 3
copi 1 1 1 1 1 1 1 1 1 1
delet 1 1 1 1 1 1 1 1 1 1
emailcopi 1 1 1 1 1 1 1 1 1 1
employe 3 3 2 2 4 4 6 3 2 3
life 1 1 3 1 2 1 1 1 1 2
respons 1 1 1 1 1 1 1 1 1 1
second 1 1 1 1 2 1 1 1 1 1
work 4 1 2 1 8 0 3 2 3 3
# Tidy (long-format) view of the term-document matrix, then keep a plain
# matrix copy for rowSums() later; also build the transposed
# document-term matrix used for frequency and association queries.
tidy(tdm)
tdm <- as.matrix(tdm)
frequencies <- DocumentTermMatrix(corpus)
frequencies
<<DocumentTermMatrix (documents: 155, terms: 2640)>>
Non-/sparse entries: 10365/398835
Sparsity : 97%
Maximal term length: 32
Weighting : term frequency (tf)
# Terms that appear at least 100 times across all pages.
findFreqTerms(frequencies, lowfreq=100)
[1] "ago" "automobiles"
[3] "copied" "delete"
[5] "edit" "emailcopy"
[7] "employee" "facebookshare"
[9] "flag" "inappropriateflag"
[11] "inappropriatehelpful" "pdt"
[13] "response" "seconds"
[15] "time" "twittershare"
[17] "via" "whatsappshare"
[19] "work" "balanceculturevaluescareer"
[21] "life" "opportunitiescompbenefitssenior"
[23] "starstarstarstarstarwork" "anonymous"
#%>%as_tibble()%>%top_n(10)
Remove sparse terms
# Drop terms absent from more than 99.5% of documents.
# NOTE(review): the printed output below shows the term count unchanged
# (2640), so this threshold prunes nothing here; a smaller value such as
# 0.95 would actually remove sparse terms — confirm intent.
sparse = removeSparseTerms(frequencies, 0.995)
sparse
<<DocumentTermMatrix (documents: 155, terms: 2640)>>
Non-/sparse entries: 10365/398835
Sparsity : 97%
Maximal term length: 32
Weighting : term frequency (tf)
What about associations between words? Let’s have a look at what other words had a high association with “love”.
# Words correlated (r >= 0.6) with "love", "poor", "flexible", "horrible".
findAssocs(frequencies, c("love","poor","flexible","horrible"), c(0.6,0.6,0.6,0.6))
$love
technician dayton featured fieldconsno ohi
0.69 0.63 0.63 0.63 0.63
retirementadvice taught workshare yearsprospros consworld
0.63 0.63 0.63 0.63 0.63
dramaticallyshare normally related scientific test
0.63 0.63 0.63 0.63 0.63
$poor
seen entire since advances allowed cheapest
0.67 0.67 0.64 0.63 0.63 0.63
choice clue creation currect decides developed
0.63 0.63 0.63 0.63 0.63 0.63
drained efforts ethics executed exited experiences
0.63 0.63 0.63 0.63 0.63 0.63
fantastic feature final flounder freedom gouge
0.63 0.63 0.63 0.63 0.63 0.63
hair hammers highlighted incorporate initial interest
0.63 0.63 0.63 0.63 0.63 0.63
items labor medical merger mirrors outstanding
0.63 0.63 0.63 0.63 0.63 0.63
participate partners perception price questionable recent
0.63 0.63 0.63 0.63 0.63 0.63
reduce requested shown sighted sink site
0.63 0.63 0.63 0.63 0.63 0.63
slogans smoke stimulate swim talked tco
0.63 0.63 0.63 0.63 0.63 0.63
viability view warranty washington continue
0.63 0.63 0.63 0.63 0.62
$flexible
numeric(0)
$horrible
old office
0.62 0.60
# Comparison word cloud across the first four pages of reviews: each
# page's distinctive vocabulary is drawn in its own colour.
wc <- Corpus(VectorSource(FCA_html[1:4]))
wc <- as.matrix(TermDocumentMatrix(wc))
comparison.cloud(wc,
                 scale = c(4, .5), max.words = 300,
                 random.order = FALSE, rot.per = .1,
                 colors = palette_light()[1:4],
                 use.r.layout = FALSE, title.size = 3)
The most commonly used words in the reviews are plotted below.
# Overall term frequencies from the (stemmed) term-document matrix.
# tibble() replaces the deprecated data_frame().
v <- sort(rowSums(tdm), decreasing = TRUE)
d <- tibble(word = names(v), freq = v) %>% mutate(word = reorder(word, freq))
head(d, 10)
wordcloud(words = d$word, freq = d$freq, min.freq = 3,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
# Bar chart of the 20 most frequent terms.
d <- d[1:20, ]
# FIX: the original mapped fill to the dummy constant "" inside aes(),
# which produced a spurious legend entry; a static fill colour via
# geom_col() (the idiomatic geom_bar(stat = "identity")) is intended.
ggplot(d, aes(x = word, y = freq)) +
  geom_col(fill = palette_light()[[1]]) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_flip()
# Positive vs negative comparison cloud (bing lexicon): acast() turns the
# word/sentiment counts into the word-by-sentiment matrix that
# comparison.cloud() expects.
GlassdoorPages %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  reshape2::acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(max.words = 100,
                   colors = viridis_pal(option = "D")(2))
# Word counts per NRC sentiment, in wide format (one column per sentiment).
# FIX: the original spread by `word`, which would create one column per
# word (thousands of columns); spreading by `sentiment` was clearly
# intended, mirroring the bing example earlier. pivot_wider() supersedes
# spread().
GlassdoorPages %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("nrc")) %>%
  count(word, sentiment, sort = TRUE) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0)
# Comparison cloud across four NRC sentiment categories.
GlassdoorPages %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("nrc")) %>%
  count(word, sentiment, sort = TRUE) %>%
  filter(sentiment %in% c("negative", "positive", "joy", "sadness")) %>%
  reshape2::acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(max.words = 200,
                   colors = viridis_pal(option = "D")(4))